#Rat Transcript Levels are heritable
Here is the plot for cis heritability from BSLMM of rat gene expression with predicted R2 overlayed in red dots
Ac_total <- load_pve(Ac_total)
plt_1 <- (ggplot(data = Ac_total, aes(x = index))
+ geom_point(aes(x=index, y=R2), colour = "red", size = 0.2)
+ geom_line(aes(y = point_estimate))
+ geom_hline(yintercept = 0, linetype=2)
+ labs(x = 'Genes Sorted by PVE',
y = 'Proportion of Variance Explained',
title = "Ac")
+ ylim(-0.5,1)
+ annotate("text", x = 300, y = 0.9, label = "Mean h2 = 0.09822", size = 2)
+ annotate("text", x = 370, y = 0.8, label = "Mean r2 = 0.08507938", size = 2)
+ annotate("text", x= 5000, y = 0.75, label = "n(rats) = 78", size = 5))
plt_1
Looking at the graph of correlation between different mixing parameters, it appears that non zero outperforms zero.
tempo <- read_tsv( "/Users/natashasanthanam/Github/rat-genomic-analysis/data/rat_elasticNet_cor.txt", col_names = TRUE)
## Rows: 3844 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): gene
## dbl (11): 1, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data_long <- tempo %>% pivot_longer(!gene, names_to = "value", values_to = "count")
plt_2 <- ggplot(data_long, aes(x = as.numeric(value), y = count)) + geom_smooth(show.legend = FALSE, se=T, size = .2) + xlab(expression(paste("elastic net mixing parameter (",alpha, ")"))) + ylab(expression(paste("10-fold cross-validation R"))) + theme_bw(base_size = 12) + coord_cartesian(ylim=c(0,0.4),xlim=c(-0.02,1.02))
plt_3 = ggplot(tempo, aes(x = `0`, y = `0.5`)) + geom_hex(bins = 50) +
geom_abline(slope = 1, intercept = 0, color = "darkgrey", size = 0.8) +
ylab("cor for mixing paramter = 0.5" ) +
xlab("cor for mixing paramter = 0")
plt_2
## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
plt_3
GWAS.df <- read_tsv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/GWAS_elasticnet_data.figure.txt", col_names = TRUE)
## Rows: 49 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): tissue
## dbl (2): n.genes, n.samples
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tissue <- c("Ac", "Il", "Lh", "Pl", "Vo")
n.genes <- c(8567, 8856, 8244, 8315, 8821)
n.samples <- c(78, 83, 83, 81, 82)
rats.df <- data.frame(tissue, n.genes , n.samples)
GWAS.df <- rbind(GWAS.df, rats.df)
GWAS.df <- GWAS.df %>% mutate(species = ifelse(tissue=="Ac" | tissue =="Il" | tissue =="Pl" | tissue=="Lh" | tissue=="Vo","rat","human"))
ggplot(GWAS.df, mapping = aes(n.samples, n.genes)) + geom_point(size = ifelse(GWAS.df$species == "human", 1.2, 1.5), shape = ifelse(GWAS.df$species == "human", 1, 20), color = ifelse(GWAS.df$species == "human", "dimgrey", "black")) + geom_label_repel(aes(label=ifelse(species == "rat", as.character(tissue),'')), box.padding = 0.35, point.padding = 0.5) + xlab("Number of Individuals") + ylab("Number of Genes Predicted") + theme(legend.position = "None")
dir <- "/Users/natashasanthanam/Box/imlab-data/data-Github/rat-genomic-analysis/sql/"
filelist <- list.files(dir, pattern = ".db", full.names = TRUE)
filename <- filelist[5]
sqlite.driver <- dbDriver("SQLite")
conn <- dbConnect(RSQLite::SQLite(), filename)
rat_pred <- dbGetQuery(conn, 'select * from extra')
rat_pred <- rat_pred %>% dplyr::select(c(gene, pred.perf.R2)) %>% rename(Vo = pred.perf.R2)
dbDisconnect(conn)
filelist <- filelist[1:(length(filelist) - 1)]
for(fila in filelist) {
filename <- fila
tis <- substr(fila, 78, 79)
sqlite.driver <- dbDriver("SQLite")
conn <- dbConnect(RSQLite::SQLite(), filename)
extra <- dbGetQuery(conn, 'select * from extra') %>% dplyr::select(c(gene, R2))
colnames(extra)[2] <- tis
rat_pred <- inner_join(rat_pred, extra, by = "gene")
dbDisconnect(conn)
}
rownames(rat_pred) <- rat_pred$gene
rat_pred <- rat_pred %>% dplyr::select(-c(gene))
pairs(rat_pred)
predict_GTEx_tiss <- readRDS("/Users/natashasanthanam/Github/rat-genomic-analysis/data/pred_R2_betw_GTEx_brain_tissues.RDS")
pairs(predict_GTEx_tiss)
# PrediXcan results
full_df <- read_tsv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/rat_metabolic_traits_Ac_full_assocs.txt", col_names = TRUE)
## Rows: 73920 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): gene, metabolic_trait
## dbl (5): effect, se, zscore, pvalue, n_samples
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tempo_df <- full_df %>% filter(pvalue < 5.836349e-06)
#589 sig genes
tempo_df %>% group_by(gene) %>% summarise(n = n())
## # A tibble: 589 × 2
## gene n
## <chr> <int>
## 1 ENSRNOG00000000245 1
## 2 ENSRNOG00000000546 1
## 3 ENSRNOG00000000571 1
## 4 ENSRNOG00000000699 1
## 5 ENSRNOG00000000720 1
## 6 ENSRNOG00000000763 1
## 7 ENSRNOG00000000775 1
## 8 ENSRNOG00000000778 1
## 9 ENSRNOG00000000823 1
## 10 ENSRNOG00000000867 1
## # … with 579 more rows
#all 10 traits
tempo_df %>% group_by(metabolic_trait) %>% summarise(n = n())
## # A tibble: 10 × 2
## metabolic_trait n
## <chr> <int>
## 1 bmi_bodylength_w_tail 18
## 2 bmi_bodylength_wo_tail 116
## 3 bodylength_w_tail 31
## 4 bodylength_wo_tail 97
## 5 bodyweight 14
## 6 epifat 31
## 7 fasting_glucose 23
## 8 parafat 218
## 9 retrofat 58
## 10 tail_length 115
gene_annot <- readRDS("/Users/natashasanthanam/Github/rat-genomic-analysis/data/gene_annotation.RDS") %>% dplyr::select(c(chr, gene_id, gene_name, start, end)) %>% rename(gene = gene_id)
tempo_manhatt <- inner_join(gene_annot, full_df, by = "gene")
tempo_manhatt$chr <- as.numeric(tempo_manhatt$chr)
manhattan(tempo_manhatt, chr="chr", bp="start", snp="gene", p="pvalue", ylim = c(0, 10), suggestiveline = F, genomewideline = F, main = "Manhattan plot of Significant PrediXcan associations")
abline(h= 5.233859, col="red")
#only keep R2 > 0.1
tempo_df <- full_df %>% filter(pvalue < 1e-03)
#qqplot
qqplot_by_group <- function(pval, group, pval_cutoff = 1, ...) {
n <- length(pval)
pexp <- rank(pval) / n
df <- data.frame(p.val = pval, grp = group) %>% group_by(grp) %>% mutate(p.exp = pval_cutoff * rank(p.val) / (n() + 1)) %>% ungroup()
p <- ggplot(df) +
geom_point(aes(x = -log10(p.exp), y = -log10(p.val), color = grp), ...) +
geom_hline(yintercept = -log10(0.05 / n)) +
geom_abline(slope = 1, intercept = 0, linetype = 2)
return(p)
}
qqplot_by_group(tempo_df$pvalue, group = 1, pval_cutoff = 1e-3)
bmi_bodylength_w_tail <- full_df %>% filter(metabolic_trait == "bmi_bodylength_w_tail")
bmi_w_tail_manhatt <- inner_join(gene_annot, bmi_bodylength_w_tail, by = "gene")
bmi_w_tail_manhatt$chr <- as.numeric(bmi_w_tail_manhatt$chr)
manhattan(bmi_w_tail_manhatt, chr="chr", bp="start", snp="gene", p="pvalue", ylim = c(0, 10), suggestiveline = F, genomewideline = F, main = "Manhattan plot for BMI Bodylength with Tail")
abline(h= 5.233859, col="red")
bodyweight <- full_df %>% filter(metabolic_trait == "bodyweight")
bodyweight_manhat <- inner_join(gene_annot, bodyweight, by = "gene")
bodyweight_manhat$chr <- as.numeric(bodyweight_manhat$chr)
manhattan(bodyweight_manhat, chr="chr", bp="start", snp="gene", p="pvalue", ylim = c(0, 10), suggestiveline = F, genomewideline = F, main = "Manhattan plot for Bodyweight")
abline(h= 5.233859, col="red")
retrofat <- full_df %>% filter(metabolic_trait == "retrofat")
retrofat_manhat <- inner_join(gene_annot, retrofat, by = "gene")
retrofat_manhat$chr <- as.numeric(retrofat_manhat$chr)
manhattan(retrofat_manhat, chr="chr", bp="start", snp="gene", p="pvalue", ylim = c(0, 10), suggestiveline = F, genomewideline = F, main = "Manhattan plot for Retrofat")
abline(h= 5.233859, col="red")
Single nested Elastic net R2 seems to overestimate R2 when compared using a double nested method
EN_lasso <- inner_join(cis_lasso_R2, elasticnet_R2, by = "gene")
cor.test(EN_lasso$R2.x, EN_lasso$R2.y)
##
## Pearson's product-moment correlation
##
## data: EN_lasso$R2.x and EN_lasso$R2.y
## t = 115.98, df = 8129, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.7811664 0.7975472
## sample estimates:
## cor
## 0.7894974
ggplot(EN_lasso, aes(R2.y, R2.x)) + geom_point() + xlab("Single nested R2 from Elastic Net") + ylab("Double nested R2 from Lasso") + geom_abline()
If we generate the same figure as from before, we see that using a double nested R2 means that rats map similar to that in GTEx tissues
cis_lasso_R2 %>% count(R2 >= 0) # only 2339 genes are predicted in double nested in lasso
## # A tibble: 2 × 2
## `R2 >= 0` n
## <lgl> <int>
## 1 FALSE 12177
## 2 TRUE 2339
GWAS.df <- read_tsv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/GWAS_elasticnet_data_non_neg_R2_fig.txt", col_names = TRUE)
## Rows: 49 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (1): tis
## dbl (2): n.genes, n
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
tis <- c("Ac")
n.genes <- c(2339)
n <- c(78)
rats_df <- data.frame(tis, n.genes , n)
GWAS.df <- rbind(GWAS.df, rats_df)
GWAS.df <- GWAS.df %>% mutate(species = ifelse(tis=="Ac","rat","human"))
theme_set(theme_bw(base_size = 12))
ggplot(GWAS.df, mapping = aes(n, n.genes)) + geom_point(size = ifelse(GWAS.df$species == "human", 1.2, 1.5), shape = ifelse(GWAS.df$species == "human", 1, 20), color = ifelse(GWAS.df$species == "human", "dimgrey", "black")) + geom_label_repel(aes(label=ifelse(species == "rat", as.character(tis),'')), box.padding = 0.35, point.padding = 0.5) + xlab("Number of Individuals") + ylab("Number of Genes Predicted") + theme(legend.position = "None") + ggtitle("Comparison to GTEx when counting for genes with double-nested R2 >= 0")
Rats
plt_2 <- (ggplot(data = double_nested_herit_Ac, aes(x = index))
+ geom_point(aes(x=index, y=R2), colour = "red", size = 0.2)
+ geom_line(aes(y = point_estimate))
+ geom_hline(yintercept = 0, linetype=2)
+ labs(x = 'Genes Sorted by PVE',
y = 'Proportion of Variance Explained',
title = "Ac")
+ ylim(-0.5,1)
+ annotate("text", x = 300, y = 0.9, label = "Mean h2 = 0.09822", size = 2)
+ annotate("text", x = 370, y = 0.8, label = "Mean r2 = -0.02087655", size = 2)
+ annotate("text", x= 5000, y = 0.75, label = "n(rats) = 78", size = 5))
plt_2
Humans
human_herit <- load_herit(human_herit)
plt_3 <- (ggplot(data = human_herit, aes(x = index))
+ geom_point(aes(x=index, y=en_r2), colour = "red", size = 0.2)
+ geom_line(aes(y = h2))
+ geom_hline(yintercept = 0, linetype=2)
+ labs(x = 'Genes Sorted by Heritability',
y = 'Heritability',
title = "Humans")
+ ylim(-0.5,1)
+ annotate("text", x = 390, y = 0.9, label = "Mean h2 = 0.1263636", size = 2)
+ annotate("text", x = 370, y = 0.8, label = "Mean r2 = 0.1189", size = 2))
plt_3
## Warning: Removed 1451 rows containing missing values (geom_point).
gcta_herit <- read_tsv("/Users/natashasanthanam/Github/rat-genomic-analysis/data/Ac_h2_annot.txt", col_names = TRUE) %>% rename(gene = ensembl_gene_id)
## Rows: 15003 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (2): ensembl_gene_id, external_gene_name
## dbl (2): h2, se
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
gcta_herit <- inner_join(Ac_r2, gcta_herit, by = "gene")
p3 = ggplot(gcta_herit, aes(h2, R2)) + geom_hex() + xlab("GCTA Heritability") + ylab("Singled Nested Predicted R2") + geom_abline(color = "dimgrey") + ggtitle("H2 vs R2 in Rats")
p4 = ggplot(human_herit, aes(h2, en_r2)) + geom_hex() + ylab("Single Nested Predicted R2") + xlab("GCTA heritability") + geom_abline(color = "dimgrey") + ggtitle("H2 vs R2 in Humans")
ggarrange(p3, p4, ncol=2, nrow =1)
## Warning: Removed 1451 rows containing non-finite values (stat_binhex).